4. Pandas - plotting


In [2]:
%pylab inline
from pandas import Series, DataFrame
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

pandas中的繪圖函數

線型圖


In [3]:
# plot() on a Series or DataFrame draws a line chart by default.
walk = np.random.randn(10).cumsum()
s = Series(walk, index = np.arange(0, 100, 10))
s.plot()


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x60f05f8>

In [4]:
# Series的索引會被作為subplot的 X軸,可以使用參數 use_index = False 來禁用該功能
# X軸的刻度可以透過 xticks 和 xlim 選項來調整
# Y軸的刻度可以透過 yticks 和 ylim 選項來調整

In [5]:
# DataFrame.plot() draws one line per column on a single subplot and
# creates the legend automatically; build a frame to demonstrate that.
row_labels = np.arange(0, 100, 10)
cumulative = np.random.randn(10, 4).cumsum(axis = 0)  # running sum down each column
df = DataFrame(cumulative, index = row_labels, columns = ['A', 'B', 'C', 'D'])
df.tail(3)


Out[5]:
A B C D
70 0.185760 -0.418143 -1.567216 -3.965365
80 2.700475 -0.807974 -2.625121 -5.042702
90 3.168337 0.033208 -3.949831 -4.925446

In [6]:
# Line chart of every column (the default plot kind, spelled out explicitly).
df.plot.line()


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x6116358>

柱狀圖


In [7]:
# Pass kind = 'bar' (vertical) or kind = 'barh' (horizontal) to draw a bar chart.
# The Series / DataFrame index labels the X axis (bar) or the Y axis (barh).

# Two stacked subplots: the vertical chart on top, the horizontal one below.
fig, axes = plt.subplots(2, 1)
# One label per value: the letters 'a' through 'p' in alphabetical order.
data = Series(np.random.rand(16), index = list('abcdefghijklmnop'))
data.plot(ax = axes[0], kind = 'bar')
data.plot(ax = axes[1], kind = 'barh')


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x9607630>

In [8]:
# Each row of this DataFrame forms one group of values.
row_names = ['one', 'two', 'three', 'four', 'five', 'six']
genus = pd.Index(['A', 'B', 'C', 'D'], name = 'Genus')
df = DataFrame(np.random.rand(6, 4), index = row_names, columns = genus)
df


Out[8]:
Genus A B C D
one 0.220608 0.513349 0.946094 0.200548
two 0.846614 0.781777 0.814468 0.534386
three 0.375286 0.143810 0.365030 0.401326
four 0.554727 0.275458 0.071217 0.576424
five 0.048619 0.932931 0.496425 0.361190
six 0.758107 0.034670 0.466343 0.932895

In [9]:
# Each row of the DataFrame becomes one group of bars.
# The name attribute of the columns index is used as the legend title.
df.plot.bar()


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x96a42b0>

In [10]:
# stacked = True stacks the bars of each row instead of placing them side by side.
df.plot.barh(stacked = True)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x97a4ef0>

In [11]:
s = Series([2, 3, 5, 2, 5, 6, 7, 8, 9, 10, 13, 2, 3, 4, 7, 8, 9, 0, 0, 2, 2, 1])

# value_counts() tallies how often each number occurs; plotting that
# result as bars shows the frequency of every distinct value.
vc = s.value_counts()
vc.plot.bar()


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x98a3320>

In [12]:
# Load the tips data set and preview its first five rows.
tips = pd.read_csv('../data/tips.csv')
tips.head()


Out[12]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

In [13]:
# crosstab() builds a cross-tabulation; by default each cell holds the
# number of occurrences (a count) of that day / party-size combination.
party_counts = pd.crosstab(index = tips['day'], columns = tips['size'])
party_counts


Out[13]:
size 1 2 3 4 5 6
day
Fri 1 16 1 1 0 0
Sat 2 53 18 13 1 0
Sun 0 39 15 18 3 1
Thur 1 48 4 5 1 3

In [14]:
# One group of bars per day, one bar per party size.
party_counts.plot.bar()


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0xa880a90>

In [15]:
# Keep only the party sizes 2 through 5. The column labels are integers, so
# .loc slices them by label and includes both end points. (.loc replaces the
# .ix indexer, which no longer exists in current pandas releases.)
party_counts = party_counts.loc[:, 2:5]
party_counts.plot(kind = 'bar', stacked = True)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0xa97feb8>

In [16]:
# Inspect the crosstab as it stands after being restricted to party sizes 2 through 5.
party_counts


Out[16]:
size 2 3 4 5
day
Fri 16 1 1 0
Sat 53 18 13 1
Sun 39 15 18 3
Thur 48 4 5 1

In [17]:
# Turn the counts into per-day proportions: divide every row by its own total.
row_totals = party_counts.sum(axis = 1)
party_counts = party_counts.div(row_totals, axis = 0)
party_counts


Out[17]:
size 2 3 4 5
day
Fri 0.888889 0.055556 0.055556 0.000000
Sat 0.623529 0.211765 0.152941 0.011765
Sun 0.520000 0.200000 0.240000 0.040000
Thur 0.827586 0.068966 0.086207 0.017241

In [18]:
# Row totals of the table, one value per day.
party_counts.sum(axis = 1)


Out[18]:
day
Fri     1.0
Sat     1.0
Sun     1.0
Thur    1.0
dtype: float64

In [19]:
# Stacked bars: one bar per day, split by party size.
party_counts.plot.bar(stacked = True)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0xaa72080>

直方圖(histogram)和密度圖


In [20]:
# Read the tips data set from disk again and preview its first five rows.
tips = pd.read_csv('../data/tips.csv')
tips.head(5)


Out[20]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4

In [21]:
# A histogram can be drawn through the plot interface:
# plot.hist(...) is the accessor spelling of plot(kind = 'hist', ...).
tips['total_bill'].plot.hist(bins = 50)
plt.title('total_bill')


Out[21]:
<matplotlib.text.Text at 0xab5d630>

In [22]:
# The hist() method is another way to draw the histogram.
tips['total_bill'].hist(bins = 50)
plt.title('total_bill')


Out[22]:
<matplotlib.text.Text at 0xac5cf98>

In [23]:
# Histogram of the tip ratio (tip divided by the total bill).
tip_ratios = tips['tip'] / tips['total_bill']
tip_ratios.hist(bins = 50)
plt.title('tip ratio')


Out[23]:
<matplotlib.text.Text at 0xad86ef0>

散佈圖(scatter plot)


In [24]:
# Load the macro data set and preview its first five rows.
macro = pd.read_csv('../data/macrodata.csv')
macro.head()


Out[24]:
year quarter realgdp realcons realinv realgovt realdpi cpi m1 tbilrate unemp pop infl realint
0 1959.0 1.0 2710.349 1707.4 286.898 470.045 1886.9 28.98 139.7 2.82 5.8 177.146 0.00 0.00
1 1959.0 2.0 2778.801 1733.7 310.859 481.301 1919.7 29.15 141.7 3.08 5.1 177.830 2.34 0.74
2 1959.0 3.0 2775.488 1751.8 289.226 491.260 1916.4 29.35 140.5 3.82 5.3 178.657 2.74 1.09
3 1959.0 4.0 2785.204 1753.7 299.356 484.052 1931.3 29.37 140.0 4.33 5.6 179.386 0.27 4.06
4 1960.0 1.0 2847.699 1770.5 331.722 462.199 1955.5 29.54 139.6 3.50 5.2 180.007 2.31 1.19

In [25]:
# Keep only the four series that are compared in the scatter plots.
selected_columns = ['cpi', 'm1', 'tbilrate', 'unemp']
data = macro[selected_columns]
data.head()


Out[25]:
cpi m1 tbilrate unemp
0 28.98 139.7 2.82 5.8
1 29.15 141.7 3.08 5.1
2 29.35 140.5 3.82 5.3
3 29.37 140.0 4.33 5.6
4 29.54 139.6 3.50 5.2

In [26]:
# diff() replaces each row with its difference from the previous row;
# the first row has no predecessor, so dropna() removes it.
log_levels = np.log(data)
trans_data = log_levels.diff().dropna()
trans_data.head()


Out[26]:
cpi m1 tbilrate unemp
1 0.005849 0.014215 0.088193 -0.128617
2 0.006838 -0.008505 0.215321 0.038466
3 0.000681 -0.003565 0.125317 0.055060
4 0.005772 -0.002861 -0.212805 -0.074108
5 0.000338 0.004289 -0.266946 0.000000

In [27]:
# plt.scatter() draws a scatter plot: one point per data row, positioned
# by the values of two of its columns.
x_col, y_col = 'm1', 'unemp'
plt.scatter(trans_data[x_col], trans_data[y_col])
plt.title('Changes in log({0}) vs. log({1})'.format(x_col, y_col))


Out[27]:
<matplotlib.text.Text at 0xaeacda0>

In [28]:
# The same scatter plot drawn through the DataFrame plotting accessor.
trans_data.plot.scatter(x = 'm1', y = 'unemp')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x5952240>

In [29]:
# pandas provides scatter_matrix() to draw scatter plots straight from a
# DataFrame: it produces a scatter diagram for every pair of columns.
# The function lives in pandas.plotting; the old top-level pd.scatter_matrix
# alias is not available in current pandas releases.
pd.plotting.scatter_matrix(trans_data, color = 'k', alpha = 0.3)


Out[29]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000000005977BE0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005A0FAC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005A5F4E0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005A9A198>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000005AE42B0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005C03860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005C4CF60>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005C896A0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x0000000005CD88D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005D22710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x0000000005D67B38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000000AEE0D68>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000000000AF1CEF0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000000AF6B9E8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000000AFA71D0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000000000AFF4940>]], dtype=object)

In [ ]: